# Default chunk options: do not show warnings or messages in the output.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)  # pipe, data-manipulation verbs and ggplot used below
library(lsa)        # cosine()
library(readxl)     # read_excel()
library(plotly)     # ggplotly()
# Read the portfolio workbook.
df <- read_excel('../../data/country_portfolios_WOS.xlsx')

# `eregroupement` values whose summed N is above 1000.
countries_keep <- df %>%
  group_by(eregroupement) %>%
  summarise(total = sum(N)) %>%
  filter(total > 1000) %>%
  pull(eregroupement)

# Restrict the data to those values.
df <- filter(df, eregroupement %in% countries_keep)
# Sum N per Type x Especialite x eregroupement (the Ediscipline column is
# dropped first), then add an explicit N = 0 row for every combination of the
# three keys that does not occur in the data.
df_clean <- df %>%
  select(-Ediscipline) %>%
  group_by(Type, Especialite, eregroupement) %>%
  summarise(N = sum(N), .groups = 'drop') %>%
  mutate(Especialite = factor(Especialite),
         eregroupement = factor(eregroupement)) %>%
  complete(Type, Especialite, eregroupement, fill = list(N = 0))

# Especialite x eregroupement pairs whose N sums to zero over all Types.
empty_topics <- df_clean %>%
  group_by(Especialite, eregroupement) %>%
  summarise(N = sum(N), .groups = 'drop') %>%
  filter(N == 0)

# Same table without those empty pairs. The join keys are spelled out so the
# match is made on the pair only, not additionally on the shared N column.
df_clean2 <- df_clean %>%
  anti_join(empty_topics, by = c('Especialite', 'eregroupement'))
# Add p = N / sum(N), where the sum runs over each Type x eregroupement group.
# reframe() hands back an ungrouped table.
add_type_share <- function(counts) {
  counts %>%
    group_by(Type, eregroupement) %>%
    reframe(Especialite,
            N,
            p = N / sum(N))
}

df_clean <- add_type_share(df_clean)
df_clean2 <- add_type_share(df_clean2)
# Pairwise cosine similarity between the International, Mobile and National
# share profiles.
#
# df: table with columns Type, Especialite, eregroupement, N and p. The caller
#     is expected to have grouped it (the file groups by eregroupement); one
#     output row is produced per group.
# Returns the grouping column plus cos_IM, cos_IN and cos_MN as plain numeric
# columns.
cossims <- function(df){
  df %>%
    # Only N has to go: left in, it would become an id column and stop the
    # three Types of one Especialite from landing on the same wide row. The
    # grouping column is kept on purpose (dplyr would re-add it anyway).
    select(-N) %>%
    pivot_wider(names_from = Type, values_from = p) %>%
    # lsa::cosine() on two vectors returns a 1x1 matrix; as.numeric() turns it
    # into a scalar so the result holds ordinary numeric columns.
    summarise(cos_IM = as.numeric(cosine(International, Mobile)),
              cos_IN = as.numeric(cosine(International, National)),
              cos_MN = as.numeric(cosine(Mobile, National)))
}
# Pairwise cosine similarities, one row per eregroupement.
cosine_sim <- cossims(group_by(df_clean, eregroupement))

size versus change

# Interactive scatter: summed N per eregroupement (log x-axis) against each of
# the three pairwise cosine similarities; the 'ZZALL' row is left out.
ggplotly(
  df %>%
    group_by(eregroupement) %>%
    summarise(N = sum(N)) %>%
    right_join(cosine_sim) %>%
    filter(eregroupement != 'ZZALL') %>%
    pivot_longer(cols = cos_IM:cos_MN,
                 names_to = 'relation',
                 values_to = 'cosine',
                 names_prefix = 'cos_') %>%
    # Spell out the two-letter codes; an unknown code would become NA.
    mutate(relation = unname(c(IM = 'International-Mobile',
                               IN = 'International-National',
                               MN = 'Mobile-National')[relation])) %>%
    ggplot(aes(x = N, y = cosine, color = relation, label = eregroupement)) +
    geom_point() +
    scale_x_log10()
)

Try to normalize size

Compare all papers vs. all papers minus one type (International/Mobile/National)

# Leave-one-out cosine similarity: for each Type, compare the profile of all
# papers (All) with the profile of all papers minus that Type.
#
# df: table with columns Type, Especialite, eregroupement, N and p, grouped by
#     the caller (the file groups by eregroupement). Type must take the values
#     International, Mobile and National.
# Returns one row per group with the columns International, National and
# Mobile, in that order, as plain numeric columns.
cossims_diff <- function(df){
  df %>%
    # p is not needed here; eregroupement stays because it is an id column.
    select(-p) %>%
    pivot_wider(id_cols = c(eregroupement, Especialite),
                names_from = Type,
                values_from = N) %>%
    # From here on each Type column holds "All minus that Type".
    mutate(All = International + Mobile + National,
           International = All - International,
           Mobile = All - Mobile,
           National = All - National) %>%
    # Rescale each leave-one-out column to sum to 1 within the group. Cosine
    # similarity is scale-invariant, so All can stay as raw counts.
    mutate(International = International / sum(International),
           Mobile = Mobile / sum(Mobile),
           National = National / sum(National)) %>%
    # lsa::cosine() on two vectors returns a 1x1 matrix; as.numeric() turns it
    # into a scalar so the result holds ordinary numeric columns.
    summarise(International = as.numeric(cosine(International, All)),
              National = as.numeric(cosine(National, All)),
              Mobile = as.numeric(cosine(Mobile, All)))
}
# Leave-one-out cosine similarities, one row per eregroupement.
cosine_sim_diff <- cossims_diff(group_by(df_clean, eregroupement))
# Interactive scatter: summed N per eregroupement (log x-axis) against the
# leave-one-out cosine of each Type; the 'ZZALL' row is left out.
ggplotly(
  df %>%
    group_by(eregroupement) %>%
    summarise(N = sum(N)) %>%
    right_join(cosine_sim_diff) %>%
    filter(eregroupement != 'ZZALL') %>%
    pivot_longer(cols = International:Mobile,
                 names_to = 'relation',
                 values_to = 'cosine') %>%
    ggplot(aes(x = N, y = cosine, color = relation, label = eregroupement)) +
    geom_point() +
    scale_x_log10()
)

group sizes

# Interactive scatter of the leave-one-out cosines: x is the summed N of the
# whole eregroupement (log scale), point size is the summed N of the Type.
ggplotly(
  df %>%
    group_by(eregroupement, Type) %>%
    summarise(N = sum(N), .groups = 'drop') %>%
    right_join(
      cosine_sim_diff %>%
        pivot_longer(International:Mobile,
                     names_to = 'Type',
                     values_to = 'cosine'),
      by = c('eregroupement', 'Type')
    ) %>%
    filter(eregroupement != 'ZZALL') %>%
    group_by(eregroupement) %>%
    # An eregroupement/Type pair that is absent from df leaves the right join
    # with N = NA. Ignore it in the total so the other Types of that
    # eregroupement still get a defined x position.
    mutate(country_N = sum(N, na.rm = TRUE)) %>%
    ggplot(aes(country_N, cosine, color = Type, size = N,
               label = eregroupement)) +
    geom_point() +
    scale_x_log10()
)
NA
# Leave-one-out cosines on the full grid (df_clean) and on the grid without
# the empty Especialite x eregroupement pairs (df_clean2).
cosine_sim_diff <- cossims_diff(group_by(df_clean, eregroupement))
cosine_sim_diff2 <- cossims_diff(group_by(df_clean2, eregroupement))

# Element-wise difference between the two sets of cosines.
select(cosine_sim_diff, -eregroupement) -
  select(cosine_sim_diff2, -eregroupement)
NA

# Leave-one-out cosines computed on the grid without empty
# Especialite x eregroupement pairs.
cosine_sim_diff2 <- df_clean2 %>%
  group_by(eregroupement) %>%
  cossims_diff()

# Interactive scatter of those cosines: x is the summed N of the whole
# eregroupement (log scale), point size is the summed N of the Type.
ggplotly(
  df %>%
    group_by(eregroupement, Type) %>%
    summarise(N = sum(N), .groups = 'drop') %>%
    right_join(
      cosine_sim_diff2 %>%
        pivot_longer(International:Mobile,
                     names_to = 'Type',
                     values_to = 'cosine'),
      by = c('eregroupement', 'Type')
    ) %>%
    filter(eregroupement != 'ZZALL') %>%
    group_by(eregroupement) %>%
    # An eregroupement/Type pair that is absent from df leaves the right join
    # with N = NA. Ignore it in the total so the other Types of that
    # eregroupement still get a defined x position.
    mutate(country_N = sum(N, na.rm = TRUE)) %>%
    ggplot(aes(country_N, cosine, color = Type, size = N,
               label = eregroupement)) +
    geom_point() +
    scale_x_log10()
)

X-axis with the Gini index of the concentration of topics

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyIHNldHVwfQprbml0cjo6b3B0c19jaHVuayRzZXQod2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0UpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGxzYSkKbGlicmFyeShyZWFkeGwpCmxpYnJhcnkocGxvdGx5KQpgYGAKCgpgYGB7cn0KZGYgPC0gcmVhZF9leGNlbCgnLi4vLi4vZGF0YS9jb3VudHJ5X3BvcnRmb2xpb3NfV09TLnhsc3gnKQpgYGAKCmBgYHtyfQpjb3VudHJpZXNfa2VlcCA8LSBkZiAlPiUgZ3JvdXBfYnkoZXJlZ3JvdXBlbWVudCkgJT4lIAogIHN1bW1hcmlzZShOPXN1bShOKSkgJT4lIAogIGZpbHRlcihOPjEwMDApICU+JSAKICBwdWxsKGVyZWdyb3VwZW1lbnQpCgpkZiA8LSBkZiAlPiUgCiAgZmlsdGVyKGVyZWdyb3VwZW1lbnQgJWluJSBjb3VudHJpZXNfa2VlcCkKYGBgCgpgYGB7cn0KZGZfY2xlYW4gPC0gZGYgJT4lIAogIHNlbGVjdCgtRWRpc2NpcGxpbmUpICU+JSAKICBncm91cF9ieShUeXBlLEVzcGVjaWFsaXRlLGVyZWdyb3VwZW1lbnQpICU+JSAKICBzdW1tYXJpc2UoTiA9IHN1bShOKSkgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgbXV0YXRlKEVzcGVjaWFsaXRlID0gZmFjdG9yKEVzcGVjaWFsaXRlKSwKICAgICAgICAgZXJlZ3JvdXBlbWVudCA9IGZhY3RvcihlcmVncm91cGVtZW50KSkgJT4lIAogICAgIGNvbXBsZXRlKFR5cGUsRXNwZWNpYWxpdGUsZXJlZ3JvdXBlbWVudCxmaWxsPWxpc3QoTj0wKSkKCiNmaWx0ZXIgZW1wdHkgdG9waWNzCgplbXB0eV90b3BpY3MgPC0gZGZfY2xlYW4gJT4lIAogIGdyb3VwX2J5KEVzcGVjaWFsaXRlLGVyZWdyb3VwZW1lbnQpICU+JSAKICBzdW1tYXJpc2UoTj0gc3VtKE4pKSAlPiUgCiAgZmlsdGVyKE49PTApCgpkZl9jbGVhbjIgPC0gZGZfY2xlYW4gJT4lIAogIGFudGlfam9pbihlbXB0eV90b3BpY3MpCmBgYAoKYGBge3J9CmRmX2NsZWFuIDwtIGRmX2NsZWFuICU+JSAKICBncm91cF9ieShUeXBlLGVyZWdyb3VwZW1lbnQpICU+JSAKICByZWZyYW1lKEVzcGVjaWFsaXRlLAogICAgICAgICAgTiwKICAgICAgICAgIHAgPSBOL3N1bShOKSkKCmRmX2NsZWFuMiA8LSBkZl9jbGVhbjIgJT4lIAogIGdyb3VwX2J5KFR5cGUsZXJlZ3JvdXBlbWVudCkgJT4lIAogIHJlZnJhbWUoRXNwZWNpYWxpdGUsCiAgICAgICAgICBOLAogICAgICAgICAgcCA9IE4vc3VtKE4pKQpgYGAKCgpgYGB7cn0KY29zc2ltcyA8LSBmdW5jdGlvbihkZil7CiAgZGYgJT4lIAogIHNlbGVjdCgtZXJlZ3JvdXBlbWVudCwtTikgJT4lIAogIHBpdm90X3dpZGVyKG5hbWVzX2Zyb20gPSBUeXBlLHZhbHVlc19mcm9tID0gcCkgJT4lIAogIHN1bW1hcmlzZShjb3NfSU0gPSBjb3NpbmUoSW50ZXJuYXRpb25hbCxNb2JpbGUpLAogICAgICAgICAgICBjb3NfSU4gPSBjb3NpbmUoSW50ZXJuYXRpb25hbCxOYXRpb25hbCksCiAgICAgICAgICAgIGNvc19NTiA9IGNvc2luZShNb2JpbGUsTmF0aW9uYWwp
KQp9CmBgYAoKCmBgYHtyfQpjb3NpbmVfc2ltIDwtIGRmX2NsZWFuICU+JSAKICBncm91cF9ieShlcmVncm91cGVtZW50KSAlPiUgCiAgY29zc2ltcygpCmBgYAoKIyMgc2l6ZSB2ZXJzdXMgY2hhbmdlCgpgYGB7ciwgZmlnLmhlaWdodD0xMiwgZmlnLndpZHRoPTEyfQpnZ3Bsb3RseSgKZGYgJT4lIGdyb3VwX2J5KGVyZWdyb3VwZW1lbnQpICU+JSAKICBzdW1tYXJpc2UoTj1zdW0oTikpICU+JSAKICByaWdodF9qb2luKGNvc2luZV9zaW0pICU+JSAKICBwaXZvdF9sb25nZXIoY29scyA9IGNvc19JTTpjb3NfTU4sbmFtZXNfdG8gPSAncmVsYXRpb24nLHZhbHVlc190byA9ICdjb3NpbmUnLG5hbWVzX3ByZWZpeCA9ICdjb3NfJykgJT4lIAogIG11dGF0ZShyZWxhdGlvbiA9IGNhc2Vfd2hlbihyZWxhdGlvbj09J0lNJyB+J0ludGVybmF0aW9uYWwtTW9iaWxlJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmVsYXRpb249PSdJTicgfidJbnRlcm5hdGlvbmFsLU5hdGlvbmFsJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmVsYXRpb249PSdNTicgfidNb2JpbGUtTmF0aW9uYWwnLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICApKSAlPiUgCiAgZmlsdGVyKGVyZWdyb3VwZW1lbnQhPSdaWkFMTCcpICU+JSAKICBnZ3Bsb3QoYWVzKE4sIGNvc2luZSwgY29sb3I9cmVsYXRpb24sIGxhYmVsPWVyZWdyb3VwZW1lbnQpKSArCiAgZ2VvbV9wb2ludCgpKwogIHNjYWxlX3hfbG9nMTAoKQopCmBgYAoKIyMgVHJ5IHRvIG5vcm1hbGl6ZSBzaXplCiMjIGNvbXBhcmUgYWxsIHBhcGVycyB2cyBhbGwgcGFwZXJzIG1pbnVzIChpbnRlcm5hdGlvbmFsL01vYmlsZS9OYXRpb25hbCkKCgpgYGB7cn0KY29zc2ltc19kaWZmIDwtIGZ1bmN0aW9uKGRmKXsKICBkZiAlPiUKICBzZWxlY3QoLWVyZWdyb3VwZW1lbnQsLXApICU+JSAKICBwaXZvdF93aWRlcihpZF9jb2xzID0gYyhlcmVncm91cGVtZW50LEVzcGVjaWFsaXRlKSxuYW1lc19mcm9tID0gVHlwZSx2YWx1ZXNfZnJvbSA9IE4pICU+JSAKICBtdXRhdGUoQWxsID0gSW50ZXJuYXRpb25hbCArIE1vYmlsZSArIE5hdGlvbmFsLAogICAgICAgICBJbnRlcm5hdGlvbmFsID0gQWxsIC0gSW50ZXJuYXRpb25hbCwKICAgICAgICAgTW9iaWxlID0gQWxsIC0gTW9iaWxlLAogICAgICAgICBOYXRpb25hbCA9IEFsbCAtIE5hdGlvbmFsKSAlPiUgCiAgbXV0YXRlKEludGVybmF0aW9uYWwgPSBJbnRlcm5hdGlvbmFsL3N1bShJbnRlcm5hdGlvbmFsKSwKICAgICAgICAgTW9iaWxlID0gTW9iaWxlL3N1bShNb2JpbGUpLAogICAgICAgICBOYXRpb25hbCA9IE5hdGlvbmFsL3N1bShOYXRpb25hbCkpICU+JSAKICBzdW1tYXJpc2UoSW50ZXJuYXRpb25hbCA9IGNvc2luZShJbnRlcm5hdGlvbmFsLEFsbCksCiAgICAgICAgICAgIE5hdGlvbmFsID0gY29zaW5lKE5hdGlvbmFsLEFsbCksCiAgICAgICAgICAgIE1vYmlsZSA9IGNvc2luZShNb2JpbGUsQWxsKSkKfQpgYGAKCmBgYHtyfQpjb3NpbmVf
c2ltX2RpZmYgPC0gZGZfY2xlYW4gJT4lIAogIGdyb3VwX2J5KGVyZWdyb3VwZW1lbnQpICU+JSAKICBjb3NzaW1zX2RpZmYoKQpgYGAKCmBgYHtyfQpnZ3Bsb3RseSgKZGYgJT4lIGdyb3VwX2J5KGVyZWdyb3VwZW1lbnQpICU+JSAKICBzdW1tYXJpc2UoTj1zdW0oTikpICU+JSAKICByaWdodF9qb2luKGNvc2luZV9zaW1fZGlmZikgJT4lIAogIHBpdm90X2xvbmdlcihjb2xzID0gSW50ZXJuYXRpb25hbDpNb2JpbGUsbmFtZXNfdG8gPSAncmVsYXRpb24nLHZhbHVlc190byA9ICdjb3NpbmUnKSAlPiUgCiAgZmlsdGVyKGVyZWdyb3VwZW1lbnQhPSdaWkFMTCcpICU+JSAKICBnZ3Bsb3QoYWVzKE4sIGNvc2luZSwgY29sb3I9cmVsYXRpb24sIGxhYmVsPWVyZWdyb3VwZW1lbnQpKSArCiAgZ2VvbV9wb2ludCgpKwogIHNjYWxlX3hfbG9nMTAoKQopCmBgYAoKCiMjIGdyb3VwIHNpemVzCgpgYGB7cn0KZ2dwbG90bHkoCmRmICU+JSBncm91cF9ieShlcmVncm91cGVtZW50LCBUeXBlKSAlPiUgCiAgc3VtbWFyaXNlKE49c3VtKE4pKSAlPiUgCiAgcmlnaHRfam9pbihjb3NpbmVfc2ltX2RpZmYgJT4lIHBpdm90X2xvbmdlcihJbnRlcm5hdGlvbmFsOk1vYmlsZSwgbmFtZXNfdG89J1R5cGUnLCB2YWx1ZXNfdG89J2Nvc2luZScpKSAlPiUgCiAgIyBwaXZvdF9sb25nZXIoY29scyA9IEludGVybmF0aW9uYWw6TW9iaWxlLG5hbWVzX3RvID0gJ3JlbGF0aW9uJyx2YWx1ZXNfdG8gPSAnY29zaW5lJykgJT4lIAogIGZpbHRlcihlcmVncm91cGVtZW50IT0nWlpBTEwnKSAlPiUgCiAgZ3JvdXBfYnkoZXJlZ3JvdXBlbWVudCkgJT4lIAogIG11dGF0ZShjb3VudHJ5X04gPSBzdW0oTikpICU+JSAKICBnZ3Bsb3QoYWVzKGNvdW50cnlfTiwgY29zaW5lLCBjb2xvcj1UeXBlLHNpemU9TiwgbGFiZWw9ZXJlZ3JvdXBlbWVudCkpICsKICBnZW9tX3BvaW50KCkrCiAgc2NhbGVfeF9sb2cxMCgpCikKCmBgYApgYGB7cn0KY29zaW5lX3NpbV9kaWZmIDwtIGRmX2NsZWFuICU+JSAKICBncm91cF9ieShlcmVncm91cGVtZW50KSAlPiUgCiAgY29zc2ltc19kaWZmKCkKY29zaW5lX3NpbV9kaWZmMiA8LSBkZl9jbGVhbjIgJT4lIAogIGdyb3VwX2J5KGVyZWdyb3VwZW1lbnQpICU+JSAKICBjb3NzaW1zX2RpZmYoKQoKY29zaW5lX3NpbV9kaWZmICU+JSBzZWxlY3QoLWVyZWdyb3VwZW1lbnQpLQpjb3NpbmVfc2ltX2RpZmYyICU+JSBzZWxlY3QoLWVyZWdyb3VwZW1lbnQpCgpgYGAKCgpgYGB7cn0KCmNvc2luZV9zaW1fZGlmZjIgPC0gZGZfY2xlYW4yICU+JSAKICBncm91cF9ieShlcmVncm91cGVtZW50KSAlPiUgCiAgY29zc2ltc19kaWZmKCkKCmdncGxvdGx5KApkZiAlPiUgZ3JvdXBfYnkoZXJlZ3JvdXBlbWVudCwgVHlwZSkgJT4lIAogIHN1bW1hcmlzZShOPXN1bShOKSkgJT4lIAogIHJpZ2h0X2pvaW4oY29zaW5lX3NpbV9kaWZmMiAlPiUgcGl2b3RfbG9uZ2VyKEludGVybmF0aW9uYWw6TW9iaWxlLCBuYW1lc190bz0nVHlwZScsIHZhbHVlc190bz0nY29zaW5lJykp
ICU+JSAKICAjIHBpdm90X2xvbmdlcihjb2xzID0gSW50ZXJuYXRpb25hbDpNb2JpbGUsbmFtZXNfdG8gPSAncmVsYXRpb24nLHZhbHVlc190byA9ICdjb3NpbmUnKSAlPiUgCiAgZmlsdGVyKGVyZWdyb3VwZW1lbnQhPSdaWkFMTCcpICU+JSAKICBncm91cF9ieShlcmVncm91cGVtZW50KSAlPiUgCiAgbXV0YXRlKGNvdW50cnlfTiA9IHN1bShOKSkgJT4lIAogIGdncGxvdChhZXMoY291bnRyeV9OLCBjb3NpbmUsIGNvbG9yPVR5cGUsc2l6ZT1OLCBsYWJlbD1lcmVncm91cGVtZW50KSkgKwogIGdlb21fcG9pbnQoKSsKICBzY2FsZV94X2xvZzEwKCkKKQoKYGBgCgoKCgpYLWF4aXMgd2l0aCB0aGUgZ2luaSBvZiB0aGUgY29uY2VudHJhdGlvbiBvZiB0b3BpY3M=